Image Preprocessing for OCR

This IPython notebook explores ways to preprocess scanned medical documents for use with OCR software such as Tesseract.

In [1]:
import cv2
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import time
%matplotlib inline
print("OpenCV Version : %s " % cv2.__version__)
OpenCV Version : 3.2.0-dev 
In [2]:
def rotate(image, angle, scale=1.0):
    """Rotate `image` about its center by `angle` degrees, scaling by `scale`.

    Pixels uncovered by the rotated content are filled with white (255),
    which matches the document background.
    """
    height, width = image.shape[:2]
    rotation_matrix = cv2.getRotationMatrix2D((width // 2, height // 2), angle, scale)

    # warp onto a canvas of the original size; border pixels become white
    return cv2.warpAffine(image, rotation_matrix, (width, height), borderValue=255)

def pad(image, h_margin=100, w_margin=100):
    """Pad a 2D image with a white border on all four sides.

    Parameters
    ----------
    image : 2D ndarray
        Single-channel input image.
    h_margin : int
        Rows of padding added above and below (default 100).
    w_margin : int
        Columns of padding added left and right (default 100).

    Returns
    -------
    ndarray
        Array of shape (h + 2*h_margin, w + 2*w_margin) with the input
        centered on a white (255) background, preserving the input dtype.
    """
    assert len(image.shape) == 2, 'Image is not 2D!'
    h, w = image.shape
    # use the input dtype instead of hard-coding uint8, so float or
    # wider-integer images survive padding unchanged
    padded = np.full((h + 2 * h_margin, w + 2 * w_margin), 255, dtype=image.dtype)
    padded[h_margin : (h_margin + h), w_margin : (w_margin + w)] = image
    return padded

def order_points(pts):
    """Reorder four corner coordinates into a canonical order.

    Returns a (4, 2) float32 array ordered top-left, top-right,
    bottom-right, bottom-left.
    """
    ordered = np.zeros((4, 2), dtype="float32")

    # smallest x+y is the top-left corner; largest x+y is the bottom-right
    corner_sums = pts.sum(axis=1)
    ordered[0] = pts[np.argmin(corner_sums)]
    ordered[2] = pts[np.argmax(corner_sums)]

    # smallest y-x is the top-right corner; largest y-x is the bottom-left
    corner_diffs = np.diff(pts, axis=1)
    ordered[1] = pts[np.argmin(corner_diffs)]
    ordered[3] = pts[np.argmax(corner_diffs)]
    return ordered

def four_point_transform(image, pts):
    """Warp the quadrilateral region `pts` of `image` onto an upright rectangle.

    The output size is taken from the longer of each pair of opposing
    edges of the quad, so the region is deskewed without clipping.
    """
    # canonical corner order: top-left, top-right, bottom-right, bottom-left
    corners = order_points(pts)
    tl, tr, br, bl = corners

    def edge_length(p, q):
        # Euclidean distance between two corner points
        return np.sqrt((p[0] - q[0]) ** 2 + (p[1] - q[1]) ** 2)

    # output width: longer of the two horizontal edges
    maxWidth = max(int(edge_length(br, bl)), int(edge_length(tr, tl)))
    # output height: longer of the two vertical edges
    maxHeight = max(int(edge_length(tr, br)), int(edge_length(tl, bl)))

    # destination rectangle in the same corner order as `corners`
    destination = np.array(
        [[0, 0],
         [maxWidth - 1, 0],
         [maxWidth - 1, maxHeight - 1],
         [0, maxHeight - 1]], dtype="float32")

    # map the quad onto the rectangle and return the deskewed crop
    transform = cv2.getPerspectiveTransform(corners, destination)
    return cv2.warpPerspective(image, transform, (maxWidth, maxHeight))

def box_height(box):
    """Return the height of a bounding box.

    `box` contains the four corner coordinates; the height is the
    distance from the top-left corner to the bottom-left corner.
    """
    corners = order_points(box)
    dx = corners[0][0] - corners[-1][0]
    dy = corners[0][1] - corners[-1][1]
    return (dx ** 2 + dy ** 2) ** 0.5

def box_aspect_ratio(box):
    """Return the height/width aspect ratio of a bounding box.

    `box` contains the four corner coordinates of the box; height is the
    left edge length, width is the top edge length.
    """
    corners = order_points(box)
    top_left, top_right, bottom_left = corners[0], corners[1], corners[-1]
    height = ((top_left[0] - bottom_left[0]) ** 2 + (top_left[1] - bottom_left[1]) ** 2) ** 0.5
    width = ((top_left[0] - top_right[0]) ** 2 + (top_left[1] - top_right[1]) ** 2) ** 0.5
    return height * 1. / width

Specify input images

In [3]:
# Paths to the scanned test documents used throughout this notebook.
filenames = [r'./testImages/1000992520057_3007_1.jpg',
             r'./testImages/1000995056687_3007_1.jpg',
             r'./testImages/1000967244029_3007_1.jpg',
             r'./testImages/1000968571699_3007_2.jpg',
             ]

Load image, convert to gray scale and apply thresholding

In [4]:
# Read the first test image from disk (OpenCV loads as BGR).
image = cv2.imread(filenames[0])

# convert to gray scale and visualize
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
matplotlib.rcParams['figure.figsize'] = (5.0, 10.0) 
plt.imshow(gray, cmap='gray')

# # Note: unfortunately equalization of histogram does not work with document images!
# equ = cv2.equalizeHist(gray)
# plt.imshow(np.hstack((gray, equ)), cmap='gray')
Out[4]:
<matplotlib.image.AxesImage at 0x1192baef0>

Using a combination of global and adaptive thresholding gives the best result

In [5]:
# Combination of global thresholding...
(T, thresh1) = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
# plt.imshow(thresh1, cmap='gray')

# And adaptive thresholding! (mean-of-neighborhood threshold, 15x15 window, offset 9)
thresh2 = cv2.adaptiveThreshold(gray, 255, 
    cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, 9)
# plt.imshow(thresh2, cmap='gray')

# bitwise OR helps to clean up artifacts at the boundaries
thresh3 = cv2.bitwise_or(thresh1, thresh2)
matplotlib.rcParams['figure.figsize'] = (60.0, 30.0) 
plt.imshow(np.hstack((thresh1, thresh2, thresh3)), cmap='gray')
Out[5]:
<matplotlib.image.AxesImage at 0x123595a20>

Two-round process

The first round performs box detection directly on the preprocessed binary image. This would normally get pretty good results. However, if the scanning quality is poor, we use the first round to clean up the borders and then do a second round.

In [6]:
# First detection round: find candidate text-line boxes on the binary image.
thresh = thresh3.copy()
# inversion to make characters non-zero
image = 255 - thresh

# morph: a wide flat kernel (20x1) dilates horizontally so characters on
# the same line merge into one connected blob per text line
struct_elem = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 1))
image = cv2.dilate(image.copy(), struct_elem, iterations=2)

# find contours (external outlines of the merged line blobs)
im2, cnts, hierarchy = cv2.findContours(image.copy().astype(np.uint8), 
        cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# drop tiny contours; 1400 px^2 is an empirical noise cutoff
cnts = [cnt for cnt in cnts if cv2.contourArea(cnt) > 1400]

# compute and draw bounding box
canvas = thresh.copy()
c = sorted(cnts, key=cv2.contourArea, reverse=True)
boxes = []
for i in range(len(c)):
    # minAreaRect gives a (possibly rotated) minimal rectangle;
    # boxPoints converts it to four corner coordinates
    rect = cv2.minAreaRect(c[i])
    box = np.int0(cv2.boxPoints(rect))
    boxes.append(box)
    cv2.drawContours(canvas, [box], -1, 0, 3)

matplotlib.rcParams['figure.figsize'] = (20.0, 20.0) 
plt.imshow(canvas, cmap='gray')
Out[6]:
<matplotlib.image.AxesImage at 0x111217fd0>

Now we inspect the statistics of the box locations, which gives us good information regarding font size, left border, right border, rotation angle (if the texts are tilted), etc.

The second round finds the most popular left and right boundaries of the bounding boxes, and then crops the original image.

In [7]:
# Collect per-box geometry statistics: height, tilt angle of the top edge,
# and the four border coordinates of every detected text box.
heights = []
lefts = []
rights = []
tops = []
bottoms = []
angles = []
for box in boxes:
    points = order_points(box)
    # height = distance from the top-left to the bottom-left corner
    height = ((points[0][0] - points[-1][0])**2 +  (points[0][1] - points[-1][1])**2)**0.5
    heights.append(height)
    # tilt of the top edge in degrees; arctan2 avoids division by zero on
    # degenerate boxes, and np.degrees uses the exact value of pi (the
    # previous version divided by 3.14, a slightly inaccurate conversion)
    angle = np.degrees(np.arctan2(points[1][1] - points[0][1], points[1][0] - points[0][0]))
    angles.append(angle)
    lefts.append(points[0][0])
    rights.append(points[2][0])
    tops.append(points[0][1])
    bottoms.append(points[2][1])

# the median height of the bounding boxes is the fontsize in pixels
fontsize = int(np.median(heights))
rotate_angle = np.median(angles)
print('Font size is {} pixels.'.format(fontsize))
print('Rotate {:.2f} degrees counterclockwise to correct text tilt.'.format(rotate_angle))
Font size is 24 pixels.
Rotate 0.13 degrees counterclockwise to correct text tilt.

Preprocessed image: this can be fed into tesseract

In [8]:
# Estimate the dominant left border: for each sorted left coordinate, count
# how many of the next 10 values lie within 10 px of it, then pick the
# coordinate with the most neighbors (a crude mode estimate).
lefts = sorted(lefts)
n_neighbors = [np.sum(np.abs(lefts[idx:idx+10] - lefts[idx]) < 10) for idx in range(len(lefts))]
idx = np.argmax(n_neighbors)
left_border = int(lefts[idx])

# Same neighbor-counting trick for the dominant right border.
rights = sorted(rights)
n_neighbors = [np.sum(np.abs(rights[idx:idx+10] - rights[idx]) < 10) for idx in range(len(rights))]
idx = np.argmax(n_neighbors)
if (idx + 10) < len(rights):
    right_border = int(rights[idx+10])
else:
    # not enough boxes to the right of the mode; fall back to the image edge
    right_border = canvas.shape[1] - 1

top_border = max(min(tops), 0)
bottom_border = max(bottoms)
print('Left {}, Right {}, Top {}, Bottom {}'.format(left_border, right_border, top_border, bottom_border))
# crop columns only; all rows are kept
cropped = thresh3[:, left_border:right_border]

# pad documents for visualization
cropped = pad(cropped)
plt.imshow(cropped, cmap='gray')
Left 53, Right 1156, Top 0, Bottom 1645.0
Out[8]:
<matplotlib.image.AxesImage at 0x129b3b470>

Or, alternatively, we can draw boxes with openCV directly

We can draw the bounding boxes with openCV and feed tesseract with each extracted box.

In [9]:
# Second detection round on the deskewed, cropped image.
image = rotate(cropped, rotate_angle)  # correct the measured text tilt
clean_canvas = image.copy()  # pristine deskewed copy reused by later cells
canvas = clean_canvas.copy()

# inversion to make characters non-zero
image = 255 - image

# morph: merge characters on each line into one blob (same as round one)
struct_elem = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 1))
image = cv2.dilate(image.copy(), struct_elem, iterations=2)

# find contours
im2, cnts, hierarchy = cv2.findContours(image.copy().astype(np.uint8), 
        cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = [cnt for cnt in cnts if cv2.contourArea(cnt) > 1400]

# compute and draw bounding box, filtering by shape this time
c = sorted(cnts, key=cv2.contourArea, reverse=True)
boxes = []
for i in range(len(c)):
    rect = cv2.minAreaRect(c[i])
    box = np.int0(cv2.boxPoints(rect))
    if box_height(box) > fontsize * 0.5 and box_aspect_ratio(box) <= 1.1:
        # text boxes should be short and wide
        boxes.append(box)
        cv2.drawContours(canvas, [box], -1, 0, 3)


matplotlib.rcParams['figure.figsize'] = (20.0, 20.0) 
plt.imshow(canvas, cmap='gray')
Out[9]:
<matplotlib.image.AxesImage at 0x12a65a198>

The coordinates of the boxes are listed in the array boxes. To extract a certain box, simply do the following perspective transformation.

In [10]:
# Warp box #10 out of the canvas as a deskewed, axis-aligned crop and show it.
roi = four_point_transform(canvas.copy(), boxes[10])
plt.imshow(roi, cmap='gray')
Out[10]:
<matplotlib.image.AxesImage at 0x11f476978>

Status up to this step:

  • Image0: stamp obscures line border
  • Image1: OK!
  • Image2: One false positive near paper border
  • Image3: Two false positives near paper border. Stamp obscures multiple lines

The results seem pretty good at this stage. The final results depend on whether tesseract is more susceptible to false positives (returning a box containing only artifacts but no characters) or false negatives (missing to detect and box certain characters). This would require knowledge of the entire pipeline.

Got stamps?

One particular problem we observe so far is that official stamps/seals sometimes overlap with the text area, and the above algorithm then returns a box containing multiple lines of characters, as in Image0 and Image3. This may or may not be a problem depending on the OCR algorithm. In case we would like to return boxes with at most one line of characters, we can proceed to perform the following delineation.

In [11]:
# Separate normal-height boxes from oversized ones (likely stamp overlaps).
canvas = clean_canvas.copy()
canvas1 = clean_canvas.copy()
oversized_boxes = []
old_boxes = []

for box in boxes:
    # Only inspect boxes with height larger than twice the fontsize
    # otherwise remove the box and its contents
    if box_height(box) <= 2 * fontsize:
        old_boxes.append(box)
        # erase (fill white) accepted boxes so only oversized regions remain
        cv2.fillPoly(canvas1, [box], 255)
        cv2.fillPoly(canvas, [box], 255)
    else:
        oversized_boxes.append(box)
# outline the remaining oversized boxes for inspection
for box in oversized_boxes:
    cv2.drawContours(canvas1, [box], -1, 0, 3)
        
plt.imshow(canvas1, cmap='gray')
        
Out[11]:
<matplotlib.image.AxesImage at 0x11b690ac8>

Let's take a look at a specific example.

In [12]:
# Show the first oversized (stamp-affected) region for a closer look.
for box in oversized_boxes[:1]:
    roi = four_point_transform(canvas.copy(), box)
    plt.imshow(roi, cmap='gray')
In [13]:
delineate_flag = 1  # set to 0 to skip the line-splitting step
image = 255 - canvas


# morph
struct_elem = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 1))
image = cv2.dilate(image.copy(), struct_elem, iterations=2)
delineated = image.copy()

if delineate_flag:
    # horizontal projection profile: row sums peak on text lines and dip
    # in the gaps between them; normalize to 0-255
    profile = np.sum(image, axis=1)
    profile = (profile / np.max(profile) * 255).astype(np.uint8)
    # adaptiveThreshold needs a 2D input, so stack the 1D profile twice
    profile_img = np.array([profile, profile])
    minima_window = cv2.adaptiveThreshold(profile_img, 255, 
    cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 51, 10)
    minima_window = minima_window[1, :]

    # x1/x2 mark the rising/falling edges of each low-profile window
    x1 = np.where(np.diff(minima_window.astype(np.float)) > 0)[0]
    x2 = np.where(np.diff(minima_window.astype(np.float)) < 0)[0]
    # clean border area
    if x1[0] > x2[0]:
        x2 = x2[1:]
    # NOTE(review): this drops the FIRST element of x1; dropping the
    # unmatched LAST edge (x1[:-1]) looks intended -- verify pairing
    if x1[-1] > x2[-1]:
        x1 = x1[1:]
    assert len(x1) == len(x2)

    # within each window, a strictly positive minimum means text runs
    # through the gap (an oversized box) -- record the cut position
    minima = []
    for i in range(len(x1)):
        if np.min(profile[x1[i]:x2[i]]) > 0:
            minima.append(np.argmin(profile[x1[i]:x2[i]]) + x1[i])
    for pt in minima:
        # draw a 10-px-high black band to sever the merged text block
        delineated[pt-5:pt+5, :] = 0

    
plt.imshow(np.hstack((image, delineated)), cmap='gray')
Out[13]:
<matplotlib.image.AxesImage at 0x11eac1c88>
In [14]:
canvas = clean_canvas.copy()

# find contours on the delineated image (merged blocks are now cut apart)
im2, cnts, hierarchy = cv2.findContours(delineated.copy().astype(np.uint8), 
        cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = [cnt for cnt in cnts if cv2.contourArea(cnt) > 1400]

# compute and draw bounding box
c = sorted(cnts, key=cv2.contourArea, reverse=True)
new_boxes = []
for i in range(len(c)):
    rect = cv2.minAreaRect(c[i])
    box = np.int0(cv2.boxPoints(rect))
    # keep only boxes at least half a font tall (filters thin slivers)
    if box_height(box) > fontsize * 0.5:
        new_boxes.append(box)

# combining the old and new boxes
boxes = old_boxes + new_boxes
for box in boxes:
    cv2.drawContours(canvas, [box], -1, 0, 3)


matplotlib.rcParams['figure.figsize'] = (20.0, 20.0) 
plt.imshow(canvas, cmap='gray')
Out[14]:
<matplotlib.image.AxesImage at 0x11df8fa20>

Status up to this step:

  • Image0: Some false positives caused by residuals of stamps.
  • Image1: OK!
  • Image2: One false positive near paper border
  • Image3: Some false positives caused by residuals of stamps.

Further exploration:

Some more ideas in extracting text boxes:

  1. Signature boxes have a smaller black pixel density than text boxes. We can use this feature to classify if a box is a signature box or a text box. Or, we can extract some examples and train a relatively shallow neural network to do this.
In [ ]:
 
In [ ]: